Path: blob/master/Part 1 - Data Preprocessing/[Python] Data Preprocessing.ipynb
1002 views
Kernel: Python 3
Data Preprocessing
Importing the libraries
In [1]:
Importing the dataset
In [2]:
In [3]:
Out[3]:
In [4]:
In [5]:
Out[5]:
array([['France', 44.0, 72000.0],
['Spain', 27.0, 48000.0],
['Germany', 30.0, 54000.0],
['Spain', 38.0, 61000.0],
['Germany', 40.0, nan],
['France', 35.0, 58000.0],
['Spain', nan, 52000.0],
['France', 48.0, 79000.0],
['Germany', 50.0, 83000.0],
['France', 37.0, 67000.0]], dtype=object)
In [6]:
In [7]:
Out[7]:
array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)
Taking care of missing data
In [8]:
In [9]:
Out[9]:
array([['France', 44.0, 72000.0],
['Spain', 27.0, 48000.0],
['Germany', 30.0, 54000.0],
['Spain', 38.0, 61000.0],
['Germany', 40.0, 63777.77777777778],
['France', 35.0, 58000.0],
['Spain', 38.77777777777778, 52000.0],
['France', 48.0, 79000.0],
['Germany', 50.0, 83000.0],
['France', 37.0, 67000.0]], dtype=object)
Encoding categorical data
In [10]:
In [11]:
Out[11]:
array([[0, 44.0, 72000.0],
[2, 27.0, 48000.0],
[1, 30.0, 54000.0],
[2, 38.0, 61000.0],
[1, 40.0, 63777.77777777778],
[0, 35.0, 58000.0],
[2, 38.77777777777778, 52000.0],
[0, 48.0, 79000.0],
[1, 50.0, 83000.0],
[0, 37.0, 67000.0]], dtype=object)
In [12]:
In [13]:
Out[13]:
array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
4.40000000e+01, 7.20000000e+04],
[ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
2.70000000e+01, 4.80000000e+04],
[ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
3.00000000e+01, 5.40000000e+04],
[ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
3.80000000e+01, 6.10000000e+04],
[ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
4.00000000e+01, 6.37777778e+04],
[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
3.50000000e+01, 5.80000000e+04],
[ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
3.87777778e+01, 5.20000000e+04],
[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
4.80000000e+01, 7.90000000e+04],
[ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
5.00000000e+01, 8.30000000e+04],
[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
3.70000000e+01, 6.70000000e+04]])
In [14]:
In [15]:
Out[15]:
array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])
Splitting the dataset into the Training set and Test set
In [16]:
In [17]:
Out[17]:
array([[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
3.50000000e+01, 5.80000000e+04],
[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
4.40000000e+01, 7.20000000e+04],
[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
4.80000000e+01, 7.90000000e+04],
[ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
3.00000000e+01, 5.40000000e+04],
[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
3.70000000e+01, 6.70000000e+04],
[ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
4.00000000e+01, 6.37777778e+04],
[ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
3.80000000e+01, 6.10000000e+04],
[ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
3.87777778e+01, 5.20000000e+04]])
In [18]:
Out[18]:
8
In [19]:
Out[19]:
array([[ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
5.00000000e+01, 8.30000000e+04],
[ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
2.70000000e+01, 4.80000000e+04]])
In [20]:
Out[20]:
2
In [21]:
Out[21]:
array([1, 0, 1, 0, 1, 1, 0, 0])
In [22]:
Out[22]:
8
In [23]:
Out[23]:
array([0, 1])
In [24]:
Out[24]:
2
Feature Scaling
In [25]:
In [26]:
Out[26]:
array([[ 1. , -0.57735027, -0.57735027, -0.7529426 , -0.62603778],
[ 1. , -0.57735027, -0.57735027, 1.00845381, 1.01304295],
[ 1. , -0.57735027, -0.57735027, 1.79129666, 1.83258331],
[-1. , 1.73205081, -0.57735027, -1.73149616, -1.09434656],
[ 1. , -0.57735027, -0.57735027, -0.36152118, 0.42765698],
[-1. , 1.73205081, -0.57735027, 0.22561096, 0.05040824],
[-1. , -0.57735027, 1.73205081, -0.16581046, -0.27480619],
[-1. , -0.57735027, 1.73205081, -0.01359102, -1.32850095]])
In [27]:
Out[27]:
array([[-1. , 1.73205081, -0.57735027, 2.18271808, 2.30089209],
[-1. , -0.57735027, 1.73205081, -2.3186283 , -1.79680973]])
In [ ]: